{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dependency: sklearn-crfsuite. Uncomment the next line to install it.\n",
    "# !pip3 install sklearn-crfsuite"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "import numpy as np\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['train_X', 'test_X', 'train_Y', 'test_Y'])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the pickled dataset and list the splits stored in it.\n",
    "# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.\n",
    "with open('bert/session-pos.pkl', 'rb') as pkl_file:\n",
    "    data = pickle.load(pkl_file)\n",
    "data.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unpack the four splits, keeping inputs and targets side by side.\n",
    "train_X, train_Y = data['train_X'], data['train_Y']\n",
    "test_X, test_Y = data['test_X'], data['test_Y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['word2idx', 'idx2word', 'tag2idx', 'idx2tag', 'char2idx'])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the vocabulary / tag lookup tables and list which ones are present.\n",
    "with open('bert/dictionary-pos.json') as json_file:\n",
    "    dictionary = json.load(json_file)\n",
    "dictionary.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The value-to-index tables are used exactly as loaded.\n",
    "word2idx, tag2idx, char2idx = (\n",
    "    dictionary[name] for name in ('word2idx', 'tag2idx', 'char2idx')\n",
    ")\n",
    "# JSON object keys are always strings, so the index-to-value tables are\n",
    "# re-keyed with ints to allow lookups by integer id.\n",
    "idx2word = {int(idx): word for idx, word in dictionary['idx2word'].items()}\n",
    "idx2tag = {int(idx): tag for idx, tag in dictionary['idx2tag'].items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: 'PAD',\n",
       " 1: 'X',\n",
       " 2: 'PROPN',\n",
       " 3: 'AUX',\n",
       " 4: 'DET',\n",
       " 5: 'NOUN',\n",
       " 6: 'PRON',\n",
       " 7: 'VERB',\n",
       " 8: 'ADP',\n",
       " 9: 'PUNCT',\n",
       " 10: 'ADV',\n",
       " 11: 'CCONJ',\n",
       " 12: 'SCONJ',\n",
       " 13: 'NUM',\n",
       " 14: 'ADJ',\n",
       " 15: 'PART',\n",
       " 16: 'SYM'}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx2tag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'PAD': 0,\n",
       " 'X': 1,\n",
       " 'PROPN': 2,\n",
       " 'AUX': 3,\n",
       " 'DET': 4,\n",
       " 'NOUN': 5,\n",
       " 'PRON': 6,\n",
       " 'VERB': 7,\n",
       " 'ADP': 8,\n",
       " 'PUNCT': 9,\n",
       " 'ADV': 10,\n",
       " 'CCONJ': 11,\n",
       " 'SCONJ': 12,\n",
       " 'NUM': 13,\n",
       " 'ADJ': 14,\n",
       " 'PART': 15,\n",
       " 'SYM': 16}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tag2idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 12, 15, 16, 12, 17,\n",
       "       18, 19, 20, 21, 22, 23,  9, 24, 25, 26, 27,  5, 28, 29, 30, 31, 32,\n",
       "       33, 34, 35, 36, 18, 37, 38, 39, 40, 41, 42,  7, 43, 44, 45, 46])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_X[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode every index-encoded training sentence back into its words.\n",
    "train_sentences = [[idx2word[idx] for idx in row] for row in train_X]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode every index-encoded test sentence back into its words.\n",
    "test_sentences = [[idx2word[idx] for idx in row] for row in test_X]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode every index-encoded training tag sequence into tag names.\n",
    "train_labels = [[idx2tag[idx] for idx in row] for row in train_Y]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode every index-encoded test tag sequence into tag names.\n",
    "test_labels = [[idx2tag[idx] for idx in row] for row in test_Y]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tag.util import untag\n",
    "\n",
    "def features(sentence, index):\n",
    "    \"\"\"Build the CRF feature dict for a single token.\n",
    "\n",
    "    sentence: [w1, w2, ...] list of word strings.\n",
    "    index: position of the target word inside ``sentence``.\n",
    "\n",
    "    Returns a dict holding the word itself, first/last position flags,\n",
    "    the 1-3 character prefixes and suffixes of the current, previous and\n",
    "    next word, and hyphen / all-digit indicators. Every previous-word\n",
    "    feature is '' for the first token and every next-word feature is ''\n",
    "    for the last token.\n",
    "    \"\"\"\n",
    "    word = sentence[index]\n",
    "    is_first = index == 0\n",
    "    is_last = index == len(sentence) - 1\n",
    "    prev_word = '' if is_first else sentence[index - 1]\n",
    "    next_word = '' if is_last else sentence[index + 1]\n",
    "\n",
    "    # Slices ([:1], [-1:]) are used instead of [0] / [-1] so that an empty\n",
    "    # token yields '' rather than raising IndexError; for non-empty tokens\n",
    "    # the values are identical.\n",
    "    return {\n",
    "        'word': word,\n",
    "        'is_first': is_first,\n",
    "        'is_last': is_last,\n",
    "        'prefix-1': word[:1],\n",
    "        'prefix-2': word[:2],\n",
    "        'prefix-3': word[:3],\n",
    "        'suffix-1': word[-1:],\n",
    "        'suffix-2': word[-2:],\n",
    "        'suffix-3': word[-3:],\n",
    "        'prev_word': prev_word,\n",
    "        'prev_word-prefix-1': prev_word[:1],\n",
    "        'prev_word-prefix-2': prev_word[:2],\n",
    "        'prev_word-prefix-3': prev_word[:3],\n",
    "        'prev_word-suffix-1': prev_word[-1:],\n",
    "        'prev_word-suffix-2': prev_word[-2:],\n",
    "        'prev_word-suffix-3': prev_word[-3:],\n",
    "        'next_word-prefix-1': next_word[:1],\n",
    "        'next_word-prefix-2': next_word[:2],\n",
    "        'next_word-prefix-3': next_word[:3],\n",
    "        'next_word-suffix-1': next_word[-1:],\n",
    "        'next_word-suffix-2': next_word[-2:],\n",
    "        'next_word-suffix-3': next_word[-3:],\n",
    "        'next_word': next_word,\n",
    "        'has_hyphen': '-' in word,\n",
    "        'is_numeric': word.isdigit(),\n",
    "    }\n",
    "\n",
    "def transform_to_dataset(tagged_sentences):\n",
    "    \"\"\"Convert tagged sentences into CRF training inputs.\n",
    "\n",
    "    tagged_sentences: iterable of sentences, each a sequence of\n",
    "    (word, tag) pairs.\n",
    "\n",
    "    Returns (X, y) where X holds one list of feature dicts per sentence\n",
    "    and y holds the matching list of tags.\n",
    "    \"\"\"\n",
    "    X, y = [], []\n",
    "\n",
    "    for tagged in tagged_sentences:\n",
    "        # Strip the tags once per sentence instead of once per token.\n",
    "        words = untag(tagged)\n",
    "        X.append([features(words, index) for index in range(len(tagged))])\n",
    "        y.append([tag for _, tag in tagged])\n",
    "\n",
    "    return X, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((97488, 50, 2), array([['Sembungan', 'PROPN'],\n",
       "        ['adalah', 'AUX'],\n",
       "        ['sebuah', 'DET'],\n",
       "        ['desa', 'NOUN'],\n",
       "        ['yang', 'PRON'],\n",
       "        ['terletak', 'VERB'],\n",
       "        ['di', 'ADP'],\n",
       "        ['kecamatan', 'NOUN'],\n",
       "        ['Kejajar', 'PROPN'],\n",
       "        [',', 'PUNCT'],\n",
       "        ['kabupaten', 'NOUN'],\n",
       "        ['Wonosobo', 'PROPN'],\n",
       "        [',', 'PUNCT'],\n",
       "        ['Jawa', 'PROPN'],\n",
       "        ['Tengah', 'PROPN'],\n",
       "        [',', 'PUNCT'],\n",
       "        ['Indonesia', 'PROPN'],\n",
       "        ['.', 'PUNCT'],\n",
       "        ['Sebuah', 'DET'],\n",
       "        ['serangan', 'NOUN'],\n",
       "        ['pengayauan', 'NOUN'],\n",
       "        ['biasanya', 'ADV'],\n",
       "        ['terjadi', 'VERB'],\n",
       "        ['di', 'ADP'],\n",
       "        ['ladang', 'NOUN'],\n",
       "        ['atau', 'CCONJ'],\n",
       "        ['dengan', 'ADP'],\n",
       "        ['membakar', 'VERB'],\n",
       "        ['sebuah', 'DET'],\n",
       "        ['rumah', 'NOUN'],\n",
       "        ['dan', 'CCONJ'],\n",
       "        ['memenggal', 'VERB'],\n",
       "        ['semua', 'DET'],\n",
       "        ['penghuninya', 'NOUN'],\n",
       "        ['ketika', 'SCONJ'],\n",
       "        ['mereka', 'PRON'],\n",
       "        ['melarikan', 'VERB'],\n",
       "        ['diri', 'NOUN'],\n",
       "        ['.', 'PUNCT'],\n",
       "        ['Perkembangan', 'NOUN'],\n",
       "        ['ini', 'DET'],\n",
       "        ['diikuti', 'VERB'],\n",
       "        ['oleh', 'ADP'],\n",
       "        ['helm', 'NOUN'],\n",
       "        ['Brodie', 'PROPN'],\n",
       "        ['yang', 'PRON'],\n",
       "        ['dipakai', 'VERB'],\n",
       "        ['tentara', 'NOUN'],\n",
       "        ['Imperium', 'PROPN'],\n",
       "        ['Britania', 'PROPN']], dtype='<U25'))"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pair every word with its tag: the stacked (2, sentences, tokens) array is\n",
    "# reordered to (sentences, tokens, 2), then the shape and first sentence shown.\n",
    "train = np.array([train_sentences, train_labels]).transpose(1, 2, 0)\n",
    "train.shape, train[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((24335, 50, 2), array([['Emma', 'PROPN'],\n",
       "        ['Summerton', 'PROPN'],\n",
       "        ['pada', 'ADP'],\n",
       "        ['bulan', 'NOUN'],\n",
       "        ['April', 'PROPN'],\n",
       "        ['2010', 'NUM'],\n",
       "        ['dan', 'CCONJ'],\n",
       "        ['tiga', 'NUM'],\n",
       "        ['gambar', 'NOUN'],\n",
       "        ['lainnya', 'ADJ'],\n",
       "        ['diambil', 'VERB'],\n",
       "        ['oleh', 'ADP'],\n",
       "        ['artis', 'ADJ'],\n",
       "        ['yang', 'PRON'],\n",
       "        ['dirilis', 'VERB'],\n",
       "        ['untuk', 'ADP'],\n",
       "        ['mempromosikan', 'VERB'],\n",
       "        ['album', 'NOUN'],\n",
       "        ['di', 'ADP'],\n",
       "        ['bulan', 'NOUN'],\n",
       "        ['Juli', 'PROPN'],\n",
       "        ['.', 'PUNCT'],\n",
       "        ['Sampul', 'PROPN'],\n",
       "        ['resmi', 'ADJ'],\n",
       "        ['album', 'NOUN'],\n",
       "        ['menunjukkan', 'VERB'],\n",
       "        ['Perry', 'PROPN'],\n",
       "        ['sedang', 'ADV'],\n",
       "        ['berbaring', 'VERB'],\n",
       "        ['telanjang', 'ADJ'],\n",
       "        ['di', 'ADP'],\n",
       "        ['awan', 'NOUN'],\n",
       "        ['kembang', 'NOUN'],\n",
       "        ['gula', 'NOUN'],\n",
       "        [',', 'PUNCT'],\n",
       "        ['dilukis', 'VERB'],\n",
       "        ['di', 'ADP'],\n",
       "        ['atas', 'NOUN'],\n",
       "        ['kanvas', 'ADJ'],\n",
       "        ['oleh', 'ADP'],\n",
       "        ['Will', 'PROPN'],\n",
       "        ['Cotton', 'PROPN'],\n",
       "        ['dan', 'CCONJ'],\n",
       "        ['dirilis', 'VERB'],\n",
       "        ['pada', 'ADP'],\n",
       "        ['tanggal', 'NOUN'],\n",
       "        ['21', 'NUM'],\n",
       "        ['Juli', 'PROPN'],\n",
       "        ['melalui', 'ADP'],\n",
       "        ['webstream', 'NOUN']], dtype='<U23'))"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pair every word with its tag: the stacked (2, sentences, tokens) array is\n",
    "# reordered to (sentences, tokens, 2), then the shape and first sentence shown.\n",
    "test = np.array([test_sentences, test_labels]).transpose(1, 2, 0)\n",
    "test.shape, test[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5min 51s, sys: 4.52 s, total: 5min 56s\n",
      "Wall time: 5min 56s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Rebinds train_X / train_Y: from here on they hold per-token feature dicts\n",
    "# and tag-name lists instead of the index arrays loaded from the pickle.\n",
    "train_X, train_Y = transform_to_dataset(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 27s, sys: 1.29 s, total: 1min 28s\n",
      "Wall time: 1min 28s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Rebinds test_X / test_Y: from here on they hold per-token feature dicts\n",
    "# and tag-name lists instead of the index arrays loaded from the pickle.\n",
    "test_X, test_Y = transform_to_dataset(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sklearn_crfsuite\n",
    "from sklearn_crfsuite import scorers\n",
    "from sklearn_crfsuite import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 18min 15s, sys: 1.58 s, total: 18min 16s\n",
      "Wall time: 18min 15s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,\n",
       "    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,\n",
       "    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,\n",
       "    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,\n",
       "    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=50,\n",
       "    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,\n",
       "    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Train a linear-chain CRF with L-BFGS, capped at 50 iterations.\n",
    "# Per the sklearn-crfsuite docs, c1 and c2 are the L1 and L2 regularisation\n",
    "# coefficients, and all_possible_transitions also generates transition\n",
    "# features for tag pairs that never occur in the training data.\n",
    "crf = sklearn_crfsuite.CRF(\n",
    "    algorithm='lbfgs',\n",
    "    c1=0.1,\n",
    "    c2=0.1,\n",
    "    max_iterations=50,\n",
    "    all_possible_transitions=True\n",
    ")\n",
    "crf.fit(train_X, train_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['PROPN',\n",
       " 'AUX',\n",
       " 'DET',\n",
       " 'NOUN',\n",
       " 'PRON',\n",
       " 'VERB',\n",
       " 'ADP',\n",
       " 'PUNCT',\n",
       " 'ADV',\n",
       " 'CCONJ',\n",
       " 'SCONJ',\n",
       " 'NUM',\n",
       " 'ADJ',\n",
       " 'PART',\n",
       " 'SYM',\n",
       " 'X']"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tags known to the fitted CRF; reused below as the label set for scoring.\n",
    "labels = list(crf.classes_)\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9179662916673392"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Predict tags for the test set and score them with a support-weighted F1.\n",
    "y_pred = crf.predict(test_X)\n",
    "metrics.flat_f1_score(test_Y, y_pred, labels=labels, average='weighted')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "       PROPN    0.91484   0.93292   0.92379    227608\n",
      "         AUX    0.99502   1.00000   0.99751     10000\n",
      "         DET    0.92717   0.91874   0.92294     38839\n",
      "        NOUN    0.88129   0.89442   0.88781    268329\n",
      "        PRON    0.96846   0.93447   0.95117     48835\n",
      "        VERB    0.93068   0.92037   0.92550    124518\n",
      "         ADP    0.93504   0.94820   0.94157    119589\n",
      "       PUNCT    0.99918   0.99891   0.99905    182824\n",
      "         ADV    0.81648   0.81499   0.81573     47760\n",
      "       CCONJ    0.96229   0.90277   0.93158     37171\n",
      "       SCONJ    0.75337   0.73696   0.74508     15150\n",
      "         NUM    0.94127   0.90583   0.92321     41211\n",
      "         ADJ    0.77309   0.73989   0.75613     45666\n",
      "        PART    0.87705   0.82745   0.85153      5500\n",
      "         SYM    1.00000   0.97278   0.98620      3600\n",
      "           X    0.00000   0.00000   0.00000       150\n",
      "\n",
      "    accuracy                        0.91810   1216750\n",
      "   macro avg    0.85470   0.84055   0.84742   1216750\n",
      "weighted avg    0.91805   0.91810   0.91797   1216750\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Per-tag precision / recall / F1 on the test set, to five decimal places.\n",
    "print(metrics.flat_classification_report(test_Y, y_pred, labels=labels, digits=5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Persist the fitted CRF to disk.\n",
    "with open('crf-pos.pkl', 'wb') as model_file:\n",
    "    pickle.dump(crf, model_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
